import pandas as pd
              import numpy as np
              import matplotlib.pyplot as plt 
              import seaborn as sns 
              import os
              # Switch the working directory so the relative CSV name below resolves.
              # NOTE(review): this absolute Windows path is machine-specific — confirm
              # or adjust it before running on another machine.
              os.chdir("C:\\Users\\ASUS\\Desktop")
              data = pd.read_csv("clinic_data.csv")
              # Preview the first three rows.
              data.head(3)


              data.head(3)


              # Summary statistics of the DataFrame's columns.
              data.describe()


              # Number of distinct values in each column.
              n = data.nunique(axis=0)
              n

Age                           109
              Gender                          2
              AppointmentRegistration    295425
              ApointmentData                534
              DayOfTheWeek                    7
              Status                          2
              Diabetes                        2
              Alcoolism                       2
              HiperTension                    2
              Handcap                         5
              Smokes                          2
              Scholarship                     2
              Tuberculosis                    2
              Sms_Reminder                    3
              AwaitingTime                  213
              dtype: int64


              def features_plots(discrete_vars, df=None):
                  """Draw the distribution of every feature on a single figure.

                  'Age' and 'AwaitingTime' are drawn as histograms with one bin per
                  distinct value; each name in ``discrete_vars`` is drawn as a bar
                  chart of its value counts.

                  Args:
                      discrete_vars: iterable of column names to draw as bar charts.
                      df: DataFrame to plot. Defaults to the module-level ``data``,
                          looked up at call time, which is what the original
                          one-argument call used.
                  """
                  import math

                  frame = data if df is None else df
                  discrete_vars = list(discrete_vars)
                  continuous_vars = ['Age', 'AwaitingTime']

                  n_cols = 2
                  n_plots = len(continuous_vars) + len(discrete_vars)
                  # Keep the original 7x2 grid whenever everything fits in it, and
                  # only add rows when more than 12 discrete variables are passed
                  # (a fixed 7x2 grid would make plt.subplot raise in that case).
                  n_rows = max(7, math.ceil(n_plots / n_cols))

                  plt.figure(figsize=(20, 30))

                  for position, cv in enumerate(continuous_vars, start=1):
                      plt.subplot(n_rows, n_cols, position)
                      plt.hist(frame[cv], bins=len(frame[cv].unique()))
                      plt.title(cv)
                      plt.ylabel('Frequency')

                  # Discrete charts continue right after the continuous ones.
                  first_discrete = len(continuous_vars) + 1
                  for position, dv in enumerate(discrete_vars, start=first_discrete):
                      plt.subplot(n_rows, n_cols, position)
                      frame[dv].value_counts().plot(kind='bar', title=dv)
                      plt.ylabel('Frequency')
                      
              # Columns drawn as bar charts (every feature except Age and AwaitingTime).
              discrete_vars = [
                  'Gender',
                  'DayOfTheWeek',
                  'Status',
                  'Diabetes',
                  'Alcoolism',
                  'HiperTension',
                  'Handcap',
                  'Smokes',
                  'Scholarship',
                  'Tuberculosis',
                  'Sms_Reminder',
              ]
              features_plots(discrete_vars)


              # Keep only the rows whose Age is non-negative.
              data = data[data['Age'] >= 0]


              # Relative frequency of each Handcap level.
              data["Handcap"].value_counts(normalize=True)

0    0.981343
              1    0.016994
              2    0.001497
              3    0.000130
              4    0.000037
              Name: Handcap, dtype: float64


              # Remove the Handcap column.
              data = data.drop(columns="Handcap")


              # Replace AwaitingTime with its absolute value.
              data["AwaitingTime"] = data["AwaitingTime"].abs()


              data.head(n=2)


              from sklearn import preprocessing

              # One encoder per column: refitting a single LabelEncoder on Gender
              # would discard the fitted Status classes, making it impossible to
              # decode predictions back to the original labels later.
              status_encoder = preprocessing.LabelEncoder()
              gender_encoder = preprocessing.LabelEncoder()
              data["Status"] = status_encoder.fit_transform(data["Status"])
              data["Gender"] = gender_encoder.fit_transform(data["Gender"])
              # Backward-compatible alias: `le` previously ended up fitted on Gender.
              le = gender_encoder


              # Days are mapped explicitly so the codes follow calendar order
              # (LabelEncoder would number them in sorted label order instead).
              dow_mapping = {'Monday' : 0, 'Tuesday' : 1, 'Wednesday' : 2, 'Thursday' : 3, 'Friday' : 4, 'Saturday' : 5, 'Sunday' : 6}
              data['DayOfTheWeek'] = data['DayOfTheWeek'].map(dow_mapping)


              # Same bar-chart columns as before, without Handcap.
              discrete_vars = [
                  'Gender', 'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoolism',
                  'HiperTension', 'Smokes', 'Scholarship', 'Tuberculosis',
                  'Sms_Reminder',
              ]

              features_plots(discrete_vars)


              # Scatter Age against AwaitingTime, coloured by Status, on a 15x5 figure.
              plt.figure(figsize=(15,5))
              sns.scatterplot(data=data,x="Age",y="AwaitingTime",hue="Status")
              # Restrict both axes to the 0-120 range.
              plt.xlim(0, 120)
              plt.ylim(0, 120)
              plt.show()


              # Row count per (Sms_Reminder, Status) pair, pivoted so each Status
              # value becomes a column; missing combinations are filled with 0.
              data_Analytics_Educator = (
                  data.groupby(['Sms_Reminder', 'Status'])['Sms_Reminder']
                  .count()
                  .unstack('Status')
                  .fillna(0)
              )
              data_Analytics_Educator


              # Stacked bars: one bar per reminder count, split by Status code.
              data_Analytics_Educator[[0, 1]].plot.bar(stacked=True)
              plt.xlabel('Number of SMS reminders')
              plt.ylabel('Frequency')
              plt.title('Frequency of people showing up and not showing up by number of SMS reminders sent')
              plt.show()


              # Row count per (DayOfTheWeek, Status) pair, pivoted on Status.
              data_AE = (
                  data.groupby(['DayOfTheWeek', 'Status'])['DayOfTheWeek']
                  .count()
                  .unstack('Status')
                  .fillna(0)
              )
              data_AE.plot.bar(stacked=True)
              plt.xlabel('Day of the week')
              plt.ylabel('Frequency')
              plt.title('Frequency of people showing up and not showing up by Day of the week')
              plt.show()


              from datetime import date, time, datetime

              # Parse the leading YYYY-MM-DD part of the appointment timestamp and
              # derive the appointment year and month from it.
              data["app_date"] = pd.to_datetime(
                  data["ApointmentData"].str[:10], format="%Y-%m-%d"
              )
              data["app_year"] = data["app_date"].dt.year
              data["app_month"] = data["app_date"].dt.month


              # The helper date, the raw appointment timestamp and the registration
              # timestamp are not used afterwards, so all three are dropped together.
              data = data.drop(
                  columns=["app_date", "ApointmentData", "AppointmentRegistration"]
              )


              data.head()


              # One-hot encode year and month, dropping the first level of each.
              data = pd.get_dummies(
                  data,
                  columns=['app_year', 'app_month'],
                  drop_first=True,
              )


              # Count missing values per column (the recorded output shows none).
              data.isnull().sum()

Age              0
              Gender           0
              DayOfTheWeek     0
              Status           0
              Diabetes         0
              Alcoolism        0
              HiperTension     0
              Handcap          0
              Smokes           0
              Scholarship      0
              Tuberculosis     0
              Sms_Reminder     0
              AwaitingTime     0
              app_year_2015    0
              app_month_2      0
              app_month_3      0
              app_month_4      0
              app_month_5      0
              app_month_6      0
              app_month_7      0
              app_month_8      0
              app_month_9      0
              app_month_10     0
              app_month_11     0
              app_month_12     0
              dtype: int64


              # We remove the label values from our training data
              # Features: every column except the target label.
              X = data.drop(columns=['Status'])

              # Target: the encoded Status column.
              y = data['Status']


              # 70:30 train/test split. A fixed random_state makes the split (and
              # the metrics computed on it) reproducible across runs; stratify=y
              # keeps the class ratio of Status the same in both parts.
              from sklearn.model_selection import train_test_split
              X_train, X_test, y_train, y_test = train_test_split(
                  X, y, test_size=0.3, random_state=42, stratify=y
              )


              from sklearn.tree import DecisionTreeClassifier

              # Fit a decision tree with default hyper-parameters on the training set.
              classifier = DecisionTreeClassifier()
              classifier.fit(X=X_train, y=y_train)

DecisionTreeClassifier()


              from sklearn.metrics import classification_report, confusion_matrix

              # Predict on the held-out test set and print per-class metrics.
              y_pred = classifier.predict(X_test)
              print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support
              
                         0       0.33      0.36      0.34     27214
                         1       0.71      0.68      0.69     62785
              
                  accuracy                           0.58     89999
                 macro avg       0.52      0.52      0.52     89999
              weighted avg       0.59      0.58      0.59     89999


              from sklearn.ensemble import RandomForestClassifier

              # Fit a random forest with default hyper-parameters on the training set.
              rf = RandomForestClassifier()
              rf.fit(X=X_train, y=y_train)

RandomForestClassifier()


              from sklearn.metrics import classification_report, confusion_matrix

              # Predict on the held-out test set and print per-class metrics.
              y_pred = rf.predict(X_test)
              print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support
              
                         0       0.35      0.23      0.28     27214
                         1       0.71      0.82      0.76     62785
              
                  accuracy                           0.64     89999
                 macro avg       0.53      0.52      0.52     89999
              weighted avg       0.60      0.64      0.61     89999


              from sklearn.ensemble import GradientBoostingClassifier
              from sklearn.metrics import classification_report

              # Fit gradient boosting with default hyper-parameters and predict on
              # the held-out test set.
              clf = GradientBoostingClassifier()
              clf.fit(X_train, y_train)
              y_pred = clf.predict(X_test)


              # classification_report takes (y_true, y_pred). Passing them the other
              # way round swaps precision with recall and reports support for the
              # predicted labels instead of the true ones.
              print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support
              
                         0       0.01      0.52      0.03       780
                         1       0.99      0.70      0.82     89219
              
                  accuracy                           0.70     89999
                 macro avg       0.50      0.61      0.43     89999
              weighted avg       0.99      0.70      0.81     89999

	Age	Diabetes	Alcoolism	HiperTension	Handcap	Smokes	Scholarship	Tuberculosis	Sms_Reminder	AwaitingTime
count	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000
mean	37.808017	0.077967	0.025010	0.215890	0.020523	0.052370	0.096897	0.000450	0.574173	-13.841813
std	22.809014	0.268120	0.156156	0.411439	0.155934	0.222772	0.295818	0.021208	0.499826	15.687697
min	-2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-398.000000
25%	19.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-20.000000
50%	38.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	-8.000000
75%	56.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	-4.000000
max	113.000000	1.000000	1.000000	1.000000	4.000000	1.000000	1.000000	1.000000	2.000000	-1.000000

Status	0	1
Sms_Reminder
0	38915	89631
1	51546	119103
2	268	531

	Age	Gender	AppointmentRegistration	ApointmentData	DayOfTheWeek	Status	AwaitingTime
0	19	M	2014-12-16T14:46:25Z	2015-01-14T00:00:00Z	Wednesday	Show-Up	-29
1	24	F	2015-08-18T07:01:26Z	2015-08-19T00:00:00Z	Wednesday	Show-Up	-1
2	4	F	2014-02-17T12:53:46Z	2014-02-18T00:00:00Z	Tuesday	Show-Up	-1

	Age	Gender	DayOfTheWeek	Status	Sms_Reminder	AwaitingTime	app_year	app_month
0	19	1	2	1	0	29	2015	1
1	24	0	2	1	0	1	2015	8
2	4	0	1	1	0	1	2014	2
3	5	1	3	1	1	15	2014	8
4	38	1	1	1	1	6	2015	10

Loss curtailing at Calcutta Medical Clinic¶

To further investigate, Dr. Joyita Sanyal obtains the past data of the clinic about their transactions.¶

Get the dataset¶

Dr. Joyita Sanyal seeks help from Analytics Educator to deep dive into the data to extract insights out of it.¶

Data Dictionary for the clinic data¶

Age¶

Gender¶

AppointmentRegistration¶

ApointmentData¶

DayOfTheWeek¶

Status¶

Diabetes¶

Alcoolism¶

HiperTension¶

Handicap¶

Smokes¶

Tuberculosis¶

Scholarship¶

Sms_Reminder¶

AwaitingTime¶

Dr. Joyita was hoping to do the following with the information from the data dump:¶

She observes the following about the variables:¶

Integer:¶

String:¶

Datetime:¶

Boolean:¶

Descriptive Analytics¶

Show the number of unique values by variables¶

Create visualization to gain insights¶

Looking at the plots Dr. Joyita realized the following:¶

Data Cleaning¶

Negative values in "Age" did not make intuitive sense to her, so she treated them as noise and decided to delete those rows.¶

Handicap was supposed to be a binary variable. However, Dr. Joyita noticed that the value 0 accounted for more than 98% of the total count. She concluded that this variable does not have enough variance to have any impact, so it is dropped from the data.¶

Dr. Joyita also recalled that some values in waiting time appeared to be negative, so it made sense to turn them into positive values.¶

Dr. Joyita recalls reading that machine learning works best with numbers rather than strings. Hence, she decides to convert string variables such as Gender, Day of the week, and Status into numbers.¶

Analytics Educator had advised Dr. Joyita not to use the same technique on the day of the week, since it converts strings into numbers in alphabetical order. E.g. Friday would have been coded as 0, Monday as 1, and Saturday as 2.¶

Hence it was decided to use the mapping function to convert the day of the week into numbers¶

Now Dr. Joyita will check back the visualization once again.¶

Exploratory Data Analysis¶

She is disappointed to see that her hypothesis was wrong, and that no such correlation among the variables exists.¶

Now she is interested to see if sms reminder increases the chance of people to show up.¶

So it seems that an SMS reminder does increase, though marginally, the likelihood of a patient turning up.¶

Now she is interested to see whether the day of the week affects the chance of people showing up.¶

Main reason for losing money¶

Hence, Dr. Joyita decides to build a Machine Learning algorithm, along with the help of Analytics Educator¶

app_year and app_month are categorical variables, which simply means that no value carries a higher weight than another. E.g. an app_month value of 2 does not mean that it is double an app_month value of 1. Hence, they need to be converted into dummy variables.¶

Now Dr. Joyita also checks if there are any missing values in the data¶

Classification¶

Model Evaluation Techniques¶

Confusion Matrix¶

Dr. Joyita writes the following code to run the Machine Learning algorithms¶

Data split : Segregating the independent variables as X and dependent variable as y¶

Now we will split the data into a training set (70% of the data); the remaining 30%, named test, will be kept aside for later use.¶

Decision Tree Classification¶

Predict the results¶

Interpretation¶

Dr. Joyita is not very happy with this result, hence she decides to try another method, an ensemble technique.¶

Ensemble Methods¶

Bagging¶

Boosting¶

Random Forest Classification¶

Predict the result¶

Here the precision has improved but the recall performance has gone down. We may try one more algorithm to improve the accuracy¶

Gradient Boosting¶

Predict the result¶